import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import os
# Load the dataset from a CSV file given by a relative path.
df = pd.read_csv('Parkinsons.csv')
# Preview five randomly selected rows. No random_state is passed, so the
# rows shown will generally differ from run to run.
df.sample(5)
| | name | MDVP:Fo(Hz) | MDVP:Fhi(Hz) | MDVP:Flo(Hz) | MDVP:Jitter(%) | MDVP:Jitter(Abs) | MDVP:RAP | MDVP:PPQ | Jitter:DDP | MDVP:Shimmer | ... | Shimmer:DDA | NHR | HNR | status | RPDE | DFA | spread1 | spread2 | D2 | PPE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30 | phon_R01_S07_1 | 197.076 | 206.896 | 192.055 | 0.00289 | 0.00001 | 0.00166 | 0.00168 | 0.00498 | 0.01098 | ... | 0.01689 | 0.00339 | 26.775 | 0 | 0.422229 | 0.741367 | -7.348300 | 0.177551 | 1.743867 | 0.085569 |
| 86 | phon_R01_S21_3 | 178.222 | 202.450 | 141.047 | 0.00321 | 0.00002 | 0.00163 | 0.00194 | 0.00488 | 0.03759 | ... | 0.06219 | 0.03151 | 15.924 | 1 | 0.598714 | 0.712199 | -6.366916 | 0.335753 | 2.654271 | 0.144614 |
| 153 | phon_R01_S37_1 | 121.345 | 139.644 | 98.250 | 0.00684 | 0.00006 | 0.00388 | 0.00332 | 0.01164 | 0.02534 | ... | 0.04019 | 0.04179 | 21.520 | 1 | 0.566867 | 0.670475 | -4.865194 | 0.246404 | 2.013530 | 0.168581 |
| 80 | phon_R01_S20_3 | 96.106 | 108.664 | 84.510 | 0.00694 | 0.00007 | 0.00389 | 0.00415 | 0.01168 | 0.04024 | ... | 0.06799 | 0.01823 | 19.055 | 1 | 0.544805 | 0.770466 | -4.441519 | 0.155097 | 2.645959 | 0.327978 |
| 113 | phon_R01_S26_5 | 210.141 | 232.706 | 185.258 | 0.00534 | 0.00003 | 0.00321 | 0.00280 | 0.00964 | 0.01680 | ... | 0.02583 | 0.00620 | 23.671 | 1 | 0.441097 | 0.722254 | -5.963040 | 0.250283 | 2.489191 | 0.177807 |
5 rows × 24 columns
# Dimensions of the frame as a (rows, columns) tuple.
df.shape
(195, 24)
# Number of rows in the frame (same value as the first element of df.shape).
len(df)
195
# Data type of each column.
df.dtypes
name object MDVP:Fo(Hz) float64 MDVP:Fhi(Hz) float64 MDVP:Flo(Hz) float64 MDVP:Jitter(%) float64 MDVP:Jitter(Abs) float64 MDVP:RAP float64 MDVP:PPQ float64 Jitter:DDP float64 MDVP:Shimmer float64 MDVP:Shimmer(dB) float64 Shimmer:APQ3 float64 Shimmer:APQ5 float64 MDVP:APQ float64 Shimmer:DDA float64 NHR float64 HNR float64 status int64 RPDE float64 DFA float64 spread1 float64 spread2 float64 D2 float64 PPE float64 dtype: object
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and approximate memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 195 entries, 0 to 194 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 195 non-null object 1 MDVP:Fo(Hz) 195 non-null float64 2 MDVP:Fhi(Hz) 195 non-null float64 3 MDVP:Flo(Hz) 195 non-null float64 4 MDVP:Jitter(%) 195 non-null float64 5 MDVP:Jitter(Abs) 195 non-null float64 6 MDVP:RAP 195 non-null float64 7 MDVP:PPQ 195 non-null float64 8 Jitter:DDP 195 non-null float64 9 MDVP:Shimmer 195 non-null float64 10 MDVP:Shimmer(dB) 195 non-null float64 11 Shimmer:APQ3 195 non-null float64 12 Shimmer:APQ5 195 non-null float64 13 MDVP:APQ 195 non-null float64 14 Shimmer:DDA 195 non-null float64 15 NHR 195 non-null float64 16 HNR 195 non-null float64 17 status 195 non-null int64 18 RPDE 195 non-null float64 19 DFA 195 non-null float64 20 spread1 195 non-null float64 21 spread2 195 non-null float64 22 D2 195 non-null float64 23 PPE 195 non-null float64 dtypes: float64(22), int64(1), object(1) memory usage: 36.7+ KB
# Count the missing values in each column.
df.isnull().sum()
name 0 MDVP:Fo(Hz) 0 MDVP:Fhi(Hz) 0 MDVP:Flo(Hz) 0 MDVP:Jitter(%) 0 MDVP:Jitter(Abs) 0 MDVP:RAP 0 MDVP:PPQ 0 Jitter:DDP 0 MDVP:Shimmer 0 MDVP:Shimmer(dB) 0 Shimmer:APQ3 0 Shimmer:APQ5 0 MDVP:APQ 0 Shimmer:DDA 0 NHR 0 HNR 0 status 0 RPDE 0 DFA 0 spread1 0 spread2 0 D2 0 PPE 0 dtype: int64
# List the column labels of the frame.
df.columns
Index(['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
'spread1', 'spread2', 'D2', 'PPE'],
dtype='object')
# Per-column missing-value count again: isna() performs the same check as the
# isnull() call made earlier, so this repeats that result.
df.isna().sum()
name 0 MDVP:Fo(Hz) 0 MDVP:Fhi(Hz) 0 MDVP:Flo(Hz) 0 MDVP:Jitter(%) 0 MDVP:Jitter(Abs) 0 MDVP:RAP 0 MDVP:PPQ 0 Jitter:DDP 0 MDVP:Shimmer 0 MDVP:Shimmer(dB) 0 Shimmer:APQ3 0 Shimmer:APQ5 0 MDVP:APQ 0 Shimmer:DDA 0 NHR 0 HNR 0 status 0 RPDE 0 DFA 0 spread1 0 spread2 0 D2 0 PPE 0 dtype: int64
# Count how many times each column label occurs; any count above 1 would
# indicate a duplicated column name.
df.columns.value_counts()
name 1 MDVP:Fo(Hz) 1 D2 1 spread2 1 spread1 1 DFA 1 RPDE 1 status 1 HNR 1 NHR 1 Shimmer:DDA 1 MDVP:APQ 1 Shimmer:APQ5 1 Shimmer:APQ3 1 MDVP:Shimmer(dB) 1 MDVP:Shimmer 1 Jitter:DDP 1 MDVP:PPQ 1 MDVP:RAP 1 MDVP:Jitter(Abs) 1 MDVP:Jitter(%) 1 MDVP:Flo(Hz) 1 MDVP:Fhi(Hz) 1 PPE 1 dtype: int64
# ydata-profiling is imported here, at the point of use, rather than with the
# other imports at the top of the file.
from ydata_profiling import ProfileReport
# Build a profiling report over the whole frame with the given title.
# NOTE(review): the result is not assigned or saved; presumably this relies on
# the notebook displaying the last expression of a cell — confirm.
ProfileReport(df, title="Parkinson's Disease Profile Report")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]